# Encode the text of each utterance into word pieces (subword tokens) using a SentencePiece model.
import sys
import sentencepiece as spm

def encode_line(line, encode):
    """Turn one ``<utt_id> <text>`` input line into its tokenized output line.

    Args:
        line: A raw input line. The first whitespace-separated field is taken
            as the utterance id; the remaining fields are re-joined with
            single spaces to form the sentence.
        encode: Callable that maps the sentence string to a sequence of
            string pieces.

    Returns:
        The string ``"<utt_id> <piece> <piece> ...\\n"``, or ``None`` when the
        line is empty or contains only whitespace.
    """
    # str.split() with no arguments already ignores leading/trailing
    # whitespace, including the trailing newline.
    fields = line.split()
    if not fields:
        # Blank line: there is no utterance id to index, so signal "skip"
        # instead of raising IndexError.
        return None
    utt_id, *words = fields
    pieces = encode(' '.join(words))
    return utt_id + ' ' + ' '.join(pieces) + '\n'


def main():
    """Encode every utterance of an input text file with a SentencePiece model.

    Command line: ``<text_in> <spm_model> <text_out>``. Arguments beyond the
    third are ignored. Both text files are read/written as UTF-8.

    Raises:
        SystemExit: With a usage message when fewer than three arguments are
            supplied.
    """
    if len(sys.argv) < 4:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit('usage: <script> <text_in> <spm_model> <text_out>')
    text_in, spm_model, text_out = sys.argv[1:4]

    encoder = spm.SentencePieceProcessor(spm_model)

    def encode(sent):
        # out_type=str makes the processor return string pieces, not ids.
        return encoder.encode(sent, out_type=str)

    with open(text_in, 'r', encoding='utf-8') as f, \
            open(text_out, 'w', encoding='utf-8') as w:
        for line in f:
            out_line = encode_line(line, encode)
            if out_line is not None:
                w.write(out_line)


if __name__ == '__main__':
    main()


